In [ ]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import statsmodels.api as sm
from scipy.stats import norm, lognorm, bernoulli
from scipy.stats.mstats import mquantiles
from patsy import dmatrices
from sklearn.preprocessing import StandardScaler
%matplotlib inline
plt.style.use('bmh')
In [ ]:
# Fetch the `diamonds` dataset that ships with R's ggplot2 package.
diamonds = sm.datasets.get_rdataset(dataname='diamonds', package='ggplot2')
In [ ]:
# print(diamonds.__doc__)
In [ ]:
# Pull the tabular data out of the dataset wrapper; `.data` is a pandas DataFrame.
df = diamonds.data
In [ ]:
# Summary statistics for every column (include='all' covers non-numeric columns too).
df.describe(include="all")
In [ ]:
# Raw relationship between carat and price; partial transparency keeps
# overlapping points readable.
plt.scatter(df['carat'], df['price'], alpha=0.3)
plt.xlim(left=0, right=6)
plt.ylim(bottom=0, top=20000);
In [ ]:
# Work on an independent copy: a plain assignment would only alias `df`, so
# the derived log columns added to `df_transformed` would also be written
# into the raw data frame.
df_transformed = df.copy()
In [ ]:
# Natural log of price, stored as a new column.
df_transformed['log_price'] = np.log(df_transformed['price'])
In [ ]:
# Natural log of carat, stored as a new column.
df_transformed['log_carat'] = np.log(df_transformed['carat'])
In [ ]:
# Same scatter on the log-log scale.
plt.scatter(
    df_transformed['log_carat'],
    df_transformed['log_price'],
    alpha=0.3,
)
plt.ylim(bottom=5.5, top=10);
In [ ]:
# Rebuild the design matrices from the earlier model: log price regressed on
# log carat plus the three categorical predictors with patsy's default
# categorical coding. dmatrices returns the response first, then the predictors.
y, X = dmatrices(
    'log_price~log_carat+cut+color+clarity',
    data=df_transformed,
    return_type='matrix',
)
In [ ]:
# Ordinary least squares on the dummy-coded design matrix; print the full
# regression table.
multi_model_1 = sm.OLS(endog=y, exog=X)
multi_results_1 = multi_model_1.fit()
print(multi_results_1.summary())
The condition number is the square root of the ratio of the largest eigenvalue to the smallest eigenvalue of the variance-covariance matrix of the coefficient estimates (equivalently, of $X^\top X$, where $X$ is the design matrix). Large values indicate that small changes in the original matrix could lead to large changes in the computed inverse. This inflates the standard errors and calls the computed model coefficients into question.
We talked about dummy-variable ('one-hot') encoding last time. An alternative approach, designed specifically for ordinal variables (categorical variables with a natural order), is orthogonal polynomial encoding. Some software (R, for instance) applies it automatically when it detects an ordered variable; patsy needs an explicit instruction. The details are beyond our scope right now, but see http://www.ats.ucla.edu/stat/r/library/contrast_coding.htm#ORTHOGONAL and http://statsmodels.sourceforge.net/devel/contrasts.html for more.
In [ ]:
# Orthogonal polynomial encoding of the ordinal predictors (automatic in R for
# ordered factors).
#
# Polynomial contrasts only make sense when the levels are in their ordinal
# order. Unless the columns are already ordered pandas Categoricals (presumably
# not: the data is fetched as a plain table -- TODO confirm), patsy sorts
# string levels alphabetically, e.g. 'I1' < 'IF' < 'SI1' < 'SI2' ..., which
# scrambles `cut` and `clarity`. Pass the levels explicitly, in the order of
# ggplot2's ordered factors, so the linear/quadratic/... terms are meaningful.
cut_levels = ['Fair', 'Good', 'Very Good', 'Premium', 'Ideal']
clarity_levels = ['I1', 'SI2', 'SI1', 'VS2', 'VS1', 'VVS2', 'VVS1', 'IF']
color_levels = ['D', 'E', 'F', 'G', 'H', 'I', 'J']

# note the order of y and X again: the response is returned first
y_orth, X_orth = dmatrices(
    'log_price~log_carat'
    '+C(cut, Poly, levels=cut_levels)'
    '+C(clarity, Poly, levels=clarity_levels)'
    '+C(color, Poly, levels=color_levels)',
    data=df_transformed,
    return_type='matrix',
)
In [ ]:
# Peek at the top-left 10x10 corner of the design matrix.
# NOTE(review): this shows the dummy-coded `X`, not the polynomial-coded
# `X_orth` built in the previous cell -- confirm which one was intended.
X[0:10, 0:10]
In [ ]:
# Refit the same regression on the polynomial-coded design matrix and print
# the full regression table.
multi_model_2 = sm.OLS(endog=y_orth, exog=X_orth)
multi_results_2 = multi_model_2.fit()
print(multi_results_2.summary())
In [ ]: